# Load and tidy the BRFSS SMART 2010 data: clean names, keep only the
# "Overall Health" topic, and restrict to its five response levels.
brfss_df <-
  p8105.datasets::brfss_smart2010 %>%
  janitor::clean_names() %>% # snake_case column names
  rename(location_abbr = locationabbr, location_desc = locationdesc) %>% # readable variable names
  filter(topic == "Overall Health") %>% # focus on the "Overall Health" topic
  filter(response %in% c("Poor", "Fair", "Good", "Very good", "Excellent")) %>% # keep valid responses
  arrange(response) # sort rows by response (alphabetical)
In 2002, the states CT, FL, and NC were each observed at 7 locations.
Below is a spaghetti plot of the number of observed locations in each state over time.
# Spaghetti plot: number of distinct locations observed per state, by year.
brfss_df %>%
  distinct(year, location_abbr, location_desc) %>% # one row per location-year
  group_by(year, location_abbr) %>%
  summarise(n = n()) %>% # locations observed in each state-year
  ggplot(aes(x = year, y = n, group = location_abbr, color = location_abbr)) +
  geom_line() +
  labs(
    y = "number of observations", # typo fixed ("observatons")
    title = "Number of observations in each state from 2002 to 2010" # stray leading space removed
  ) +
  theme_bw()
# Mean and standard deviation of the "Excellent" response value in NY
# for the years 2002, 2006, and 2010, rendered as a table.
brfss_df %>%
  filter(
    year %in% c(2002, 2006, 2010),
    location_abbr == "NY",
    response == "Excellent"
  ) %>%
  group_by(year) %>%
  summarise(
    mean_exc = mean(data_value),
    sd_exc = sd(data_value)
  ) %>%
  knitr::kable(digits = 1)
| year | mean_exc | sd_exc |
|---|---|---|
| 2002 | 24.0 | 4.5 |
| 2006 | 22.5 | 4.0 |
| 2010 | 22.7 | 3.6 |
# Average data value for each response level, by state and year,
# one faceted panel per response with its own y-axis scale.
brfss_df %>%
  group_by(year, location_abbr, response) %>%
  summarise(mean_prop = mean(data_value)) %>% # NAs propagate intentionally (see warning below)
  ggplot(aes(x = year, y = mean_prop, group = location_abbr, color = location_abbr)) +
  geom_line() +
  facet_grid(response ~ ., scales = "free") + # stacked vertically, free y scales
  labs(y = "average proportion") +
  theme_bw()
## Warning: Removed 2 rows containing missing values (geom_path).
# Load the Instacart data and standardize the column names.
insta_df <- p8105.datasets::instacart %>%
  janitor::clean_names()
There are 15 variables and 1384617 observations in this dataset. It contains details of every order, including user_id (customer identifier), order_id (order identifier), product_id (product identifier), product_name, reordered (whether the item had been ordered before), and the day of the week the order was placed (we assume 0 denotes Sunday). The store also has 21 departments.
insta_df %>% select(user_id, order_id, product_id, product_name, reordered, order_dow) %>% head(10) %>% knitr::kable()
| user_id | order_id | product_id | product_name | reordered | order_dow |
|---|---|---|---|---|---|
| 112108 | 1 | 49302 | Bulgarian Yogurt | 1 | 4 |
| 112108 | 1 | 11109 | Organic 4% Milk Fat Whole Milk Cottage Cheese | 1 | 4 |
| 112108 | 1 | 10246 | Organic Celery Hearts | 0 | 4 |
| 112108 | 1 | 49683 | Cucumber Kirby | 0 | 4 |
| 112108 | 1 | 43633 | Lightly Smoked Sardines in Olive Oil | 1 | 4 |
| 112108 | 1 | 13176 | Bag of Organic Bananas | 0 | 4 |
| 112108 | 1 | 47209 | Organic Hass Avocado | 0 | 4 |
| 112108 | 1 | 22035 | Organic Whole String Cheese | 1 | 4 |
| 79431 | 36 | 39612 | Grated Pecorino Romano Cheese | 0 | 6 |
| 79431 | 36 | 19660 | Spring Water | 1 | 6 |
There are 134 aisles in the store, and the fresh vegetables aisle is the one the most items are ordered from.
Below is a plot that shows the number of items ordered in each aisle.
# a function for sorting the factors so that aisles will be listed in descending way
# Re-level a vector as a factor whose levels are sorted by frequency
# (least frequent first), so that a coord_flip() bar chart lists the
# most frequent value at the top.
reorder_size <- function(x) {
  freq <- table(x)
  factor(x, levels = names(freq)[order(freq)])
}
# Horizontal bar chart of items ordered per aisle, on a log10 count
# scale, with aisles ordered by frequency via reorder_size().
insta_df %>%
  select(aisle) %>%
  ggplot(aes(x = reorder_size(aisle))) +
  geom_bar() +
  scale_y_log10() +
  coord_flip() +
  labs(
    x = "aisles",
    y = "log10(total number of items)",
    title = "number of items ordered in each aisle"
  )
# Most-ordered product within each of three aisles; min_rank() would
# keep all products tied for first place.
insta_df %>%
  filter(aisle %in% c("baking ingredients", "dog food care", "packaged vegetables fruits")) %>%
  count(aisle, product_name, name = "item_num") %>% # orders per product
  group_by(aisle) %>%
  mutate(item_rank = min_rank(desc(item_num))) %>% # rank within aisle
  filter(item_rank == 1) %>% # keep the top product(s)
  knitr::kable(digits = 1)
| aisle | product_name | item_num | item_rank |
|---|---|---|---|
| baking ingredients | Light Brown Sugar | 499 | 1 |
| dog food care | Snack Sticks Chicken & Rice Recipe Dog Treats | 30 | 1 |
| packaged vegetables fruits | Organic Baby Spinach | 9784 | 1 |
For the numeric codes in order_dow, we assume 0 corresponds to Sunday, 1 to Monday, and so on.
# Mean hour of day at which Pink Lady Apples and Coffee Ice Cream are
# ordered, by day of week (0 = Sunday ... 6 = Saturday), one column per day.
insta_df %>%
  filter(product_name %in% c("Pink Lady Apples", "Coffee Ice Cream")) %>%
  select(product_name, order_dow, order_hour_of_day) %>%
  group_by(product_name, order_dow) %>%
  summarise(mean_hour = mean(order_hour_of_day)) %>%
  # pivot_wider() supersedes spread()
  pivot_wider(names_from = order_dow, values_from = mean_hour) %>%
  # rename the numeric day columns to weekday names for readability
  rename(Sunday = "0", Monday = "1", Tuesday = "2", Wednesday = "3",
         Thursday = "4", Friday = "5", Saturday = "6") %>%
  knitr::kable(digits = 1) # was `digit = 1` (partial argument matching)
| product_name | Sunday | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday |
|---|---|---|---|---|---|---|---|
| Coffee Ice Cream | 13.8 | 14.3 | 15.4 | 15.3 | 15.2 | 12.3 | 13.8 |
| Pink Lady Apples | 13.4 | 11.4 | 11.7 | 14.2 | 11.6 | 12.8 | 11.9 |
# Load the NY NOAA weather data and standardize the column names.
noaa_df <- p8105.datasets::ny_noaa %>%
  janitor::clean_names()
There are 7 variables and 2595176 observations in this dataset. It contains details of daily weather observations: id: weather station ID, date: date of observation, prcp: precipitation (tenths of mm), snow: snowfall (mm), snwd: snow depth (mm), tmax: maximum temperature (tenths of degrees C), tmin: minimum temperature (tenths of degrees C).
We also noticed that there are many NAs in the dataset; below is a table of the proportion of NAs in each column.
# Proportion of missing values in every column. mean(is.na(x)) is
# equivalent to sum(is.na(x)) / length(x); across() replaces seven
# copy-pasted expressions while producing the same id_na ... tmin_na names.
noaa_df %>%
  summarise(across(everything(), ~ mean(is.na(.x)), .names = "{.col}_na")) %>%
  knitr::kable(digits = 3)
| id_na | date_na | prcp_na | snow_na | snwd_na | tmax_na | tmin_na |
|---|---|---|---|---|---|---|
| 0 | 0 | 0.056 | 0.147 | 0.228 | 0.437 | 0.437 |
# Split the date into year/month/day, convert tenths-of-unit readings
# to whole units, and rename measurement columns to carry their units.
noaa_df <- noaa_df %>%
  separate(date, into = c("year", "month", "day"), sep = "-") %>% # split the date
  # prcp, tmax, tmin are stored in tenths; divide by 10 for mm / degrees C
  mutate(
    prcp = as.numeric(prcp) / 10,
    tmax = as.numeric(tmax) / 10,
    tmin = as.numeric(tmin) / 10
  ) %>%
  rename(
    prcp_in_mm = prcp, snow_in_mm = snow, snwd_in_mm = snwd,
    tmax_in_celsius = tmax, tmin_in_celsius = tmin
  )
Find the most commonly observed values for snowfall:
# Most commonly observed snowfall value: count occurrences of each
# value, sort in decreasing order, and keep the top row.
result_1 <- noaa_df %>%
  count(snow_in_mm, name = "observe_num") %>% # observations per value
  arrange(desc(observe_num)) %>%
  slice(1) # first row only; replaces the non-idiomatic `.[1,]`
The value is 0, and it has been observed 2008508 times.
# Mean maximum temperature per station for January and June of each
# year, one line per station, faceted by month.
noaa_df %>%
  filter(month %in% c("01", "06")) %>% # January and June only
  group_by(id, year, month) %>%
  summarise(mean_tmax = mean(tmax_in_celsius, na.rm = TRUE)) %>% # TRUE, not T (T is reassignable)
  filter(!is.na(mean_tmax)) %>% # drop station-months with no readings at all
  ggplot(aes(x = year, y = mean_tmax, group = id, color = id)) +
  geom_line() +
  facet_grid(month ~ ., scales = "free") +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 5), # many years: shrink axis labels
    legend.text = element_text(size = 5)  # many stations: shrink legend
  )
As we can see from the plot, the data follow an observable structure: the average temperature moves in a wave-like seasonal pattern. For example, in January 1994 the temperature at all stations reached a nadir, while in June 1999 the temperature at all stations peaked.
However, there are also some outliers: in January 1982, station USC00301723 (mean −13.4387097 °C) is an outlier, and in June 2004, station USC00302454 (mean 17.0066667 °C) is another.